RideShare Tip Information and Mapping Density in Downtown Chicago

Import City of Chicago Sample Data from 100 Level Code

You will need to populate the google api with your key

Import data file created from 100 level code which pulls from city of chicago (coc)

#set new directory
setwd("~/Class/Winter/DataViz/Project/")
#import data
coc <- read_csv("full2019.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Trip.ID = col_character(),
##   Trip.Start.Timestamp = col_character(),
##   Trip.End.Timestamp = col_character(),
##   Shared.Trip.Authorized = col_logical(),
##   Pickup.Centroid.Location = col_character(),
##   Dropoff.Centroid.Location = col_character()
## )
## See spec(...) for full column specifications.
summary(coc)
##        X1          Trip.ID          Trip.Start.Timestamp Trip.End.Timestamp
##  Min.   :    1   Length:52000       Length:52000         Length:52000      
##  1st Qu.:13001   Class :character   Class :character     Class :character  
##  Median :26000   Mode  :character   Mode  :character     Mode  :character  
##  Mean   :26000                                                             
##  3rd Qu.:39000                                                             
##  Max.   :52000                                                             
##                                                                            
##   Trip.Seconds     Trip.Miles      Pickup.Census.Tract Dropoff.Census.Tract
##  Min.   :    0   Min.   :  0.000   Min.   :1.703e+10   Min.   :1.703e+10   
##  1st Qu.:  543   1st Qu.:  1.800   1st Qu.:1.703e+10   1st Qu.:1.703e+10   
##  Median :  878   Median :  3.792   Median :1.703e+10   Median :1.703e+10   
##  Mean   : 1093   Mean   :  6.281   Mean   :1.703e+10   Mean   :1.703e+10   
##  3rd Qu.: 1417   3rd Qu.:  7.900   3rd Qu.:1.703e+10   3rd Qu.:1.703e+10   
##  Max.   :19311   Max.   :203.400   Max.   :1.703e+10   Max.   :1.703e+10   
##  NA's   :96                        NA's   :14768       NA's   :15027       
##  Pickup.Community.Area Dropoff.Community.Area      Fare       
##  Min.   : 1.00         Min.   : 1.00          Min.   :  0.00  
##  1st Qu.: 8.00         1st Qu.: 8.00          1st Qu.:  5.00  
##  Median :24.00         Median :24.00          Median : 10.00  
##  Mean   :25.86         Mean   :26.29          Mean   : 12.07  
##  3rd Qu.:32.00         3rd Qu.:32.00          3rd Qu.: 15.00  
##  Max.   :77.00         Max.   :77.00          Max.   :235.00  
##  NA's   :3249          NA's   :3777                           
##       Tip          Additional.Charges   Trip.Total     Shared.Trip.Authorized
##  Min.   : 0.0000   Min.   : 0.000     Min.   :  0.67   Mode :logical         
##  1st Qu.: 0.0000   1st Qu.: 2.550     1st Qu.:  7.55   FALSE:41930           
##  Median : 0.0000   Median : 2.550     Median : 12.55   TRUE :10070           
##  Mean   : 0.6448   Mean   : 3.041     Mean   : 15.76                         
##  3rd Qu.: 0.0000   3rd Qu.: 2.550     3rd Qu.: 17.55                         
##  Max.   :44.0000   Max.   :33.120     Max.   :289.84                         
##                                                                              
##   Trips.Pooled    Pickup.Centroid.Latitude Pickup.Centroid.Longitude
##  Min.   : 1.000   Min.   :41.65            Min.   :-87.91           
##  1st Qu.: 1.000   1st Qu.:41.88            1st Qu.:-87.68           
##  Median : 1.000   Median :41.89            Median :-87.65           
##  Mean   : 1.225   Mean   :41.89            Mean   :-87.67           
##  3rd Qu.: 1.000   3rd Qu.:41.93            3rd Qu.:-87.63           
##  Max.   :11.000   Max.   :42.02            Max.   :-87.53           
##                   NA's   :3207             NA's   :3207             
##  Pickup.Centroid.Location Dropoff.Centroid.Latitude Dropoff.Centroid.Longitude
##  Length:52000             Min.   :41.65             Min.   :-87.91            
##  Class :character         1st Qu.:41.88             1st Qu.:-87.68            
##  Mode  :character         Median :41.89             Median :-87.65            
##                           Mean   :41.89             Mean   :-87.67            
##                           3rd Qu.:41.93             3rd Qu.:-87.63            
##                           Max.   :42.02             Max.   :-87.53            
##                           NA's   :3740              NA's   :3740              
##  Dropoff.Centroid.Location
##  Length:52000             
##  Class :character         
##  Mode  :character         
##                           
##                           
##                           
## 
  • Look at the mean tip - it is only .65 cents per ride. Ouch. Not a big tipping industry

Some Data Cleaning.

Just in case 100 file is old or more general help check

#Look for hourly patterns
hourly_data <- coc
# Clean up bad column names
hourly_data <- hourly_data %>% clean_names()
#na.omit(hourly_data)
#summary(hourly_data)
hourly_data$hour <- hour(as.POSIXlt(hourly_data$trip_start_timestamp, format="%m/%d/%Y %I:%M:%S %p"))       
  • Look at the tip mean. 64.4 cents!!

Look into Tip Information

Yes, I’m going to make a pie chart below. It will highlight that only 20% of riders tip. 80% leave no tip! This is the only time I would use a pie chart. Showing two or three numbers that are out of balance or equal weights.
This was harder than I would have guested. I had to do a lot of numberic conversions below. Adding the title must have taken 1 hour or more.
tips <- subset(hourly_data, hourly_data$tip > 0)
hourly_data$isTipper <- FALSE
hourly_data$isTipper[hourly_data$tip > 0] <- TRUE

my_pie <- data.frame(
  group=c("Tippers", "NoTips"),
  value2=c(toString(count(tips)),toString(count(hourly_data)-count(tips)))
  )

my_pie$value <- as.numeric(as.character(my_pie$value))
my_pie <- my_pie[,-c(2)]

head(my_pie)
##     group value
## 1 Tippers  9961
## 2  NoTips 42039
#ggplot(my_pie, aes(x="", y=value, fill=group)) +
#  geom_bar(stat="identity", width=1, color="white") +
#  coord_polar("y", start=0) +
#  theme_void()

df <- my_pie %>%
   # factor levels need to be the opposite order of the cumulative sum of the values
   mutate(group = group,
          cumulative = cumsum(value),
          midpoint = cumulative - value / 2,
          label = paste0(group, " ", round(value / sum(value) * 100, 1), "%"))

ggplot(df, aes(x = 1, weight = value, fill = group)) +
   geom_bar(width = 1, position = "stack") +
   coord_polar(theta = "y") +
   geom_text(aes(x = 1.3, y = midpoint, label = label)) +
   theme(axis.ticks = element_blank(),
         axis.text = element_blank(),
         axis.title = element_blank(),
         plot.caption = element_text(hjust = 0.5)) +
   ggtitle("Percentage of Riders Who Tip") 

#general pie chart - no ggplot
#pie(my_pie$value, my_pie$group)
percTip <- count(tips) / (count(hourly_data)) 
percTip
##           n
## 1 0.1915577
  • Above we show most pepople don’t tip.

Let’s look at those that do tip. How are they doing percentage wise?

fare vs tip - what % are drivers getting?

ggplot (hourly_data, aes(x=hourly_data$fare, y=tip)) +geom_point()

#### look at historgram without zero tips.

ggplot (tips, aes(x=tips$tip)) +geom_histogram(bins = 24)

#### First look at historgram without zero tips.

ggplot (hourly_data, aes(x=hourly_data$shared_trip_authorized, y=tip))+ geom_boxplot()

Look into rides by hour

Histogram of Rides by Hour below

ggplot (hourly_data, aes(x=hourly_data$hour)) +geom_histogram(bins = 24)

#ggplot (hourly_data, aes(x=hourly_data$trip_start_timestamp, y=`trip_total`))+ geom_col()

Pick-Up Density

Now let’s look at all the pick-up locations:

# Chicago = 41.8781° N, -87.6298° W
p <-ggmap(get_googlemap(center = c(lon = -87.629800, lat =41.878100 ),
                    zoom = 11, scale = 2,
                    maptype ='roadmap',
                    color = 'color', key = myKey))
## Source : https://maps.googleapis.com/maps/api/staticmap?center=41.8781,-87.6298&zoom=11&size=640x640&scale=2&maptype=roadmap&key=xxx
p + geom_point(aes(x = Pickup.Centroid.Longitude, y = Pickup.Centroid.Latitude), data = coc, size = 0.7) +
  theme(legend.position="bottom")
## Warning: Removed 5918 rows containing missing values (geom_point).

Drop-Off Density

Now let’s look at all the drop-off locations:

d <-ggmap(get_googlemap(center = c(lon = -87.629800, lat =41.878100 ),
                    zoom = 11, scale = 2,
                    maptype ='terrain',
                    color = 'color', key = myKey))
## Source : https://maps.googleapis.com/maps/api/staticmap?center=41.8781,-87.6298&zoom=11&size=640x640&scale=2&maptype=terrain&key=xxx
d + geom_point(aes(x = Dropoff.Centroid.Longitude, y = Dropoff.Centroid.Latitude), data = coc, size = 0.7) +
  theme(legend.position="bottom")
## Warning: Removed 6710 rows containing missing values (geom_point).

######### #REPEAT ABOVE WITH ZOOM ######### ### Pick-Up Density

Now let’s look at all the pick-up locations:

# Chicago = 41.8781° N, -87.6298° W
p <-ggmap(get_googlemap(center = c(lon = -87.629800, lat =41.878100 ),
                    zoom = 12, scale = 2,
                    maptype ='terrain',
                    color = 'color', key = myKey))
## Source : https://maps.googleapis.com/maps/api/staticmap?center=41.8781,-87.6298&zoom=12&size=640x640&scale=2&maptype=terrain&key=xxx
p + geom_point(aes(x = Pickup.Centroid.Longitude, y = Pickup.Centroid.Latitude), data = coc, size = 0.7) +
  theme(legend.position="bottom")
## Warning: Removed 15497 rows containing missing values (geom_point).

Drop-Off Density

Now let’s look at all the drop-off locations:

d <-ggmap(get_googlemap(center = c(lon = -87.629800, lat =41.878100 ),
                    zoom = 14, scale = 2,
                    maptype ='terrain',
                    color = 'color', key = myKey))
## Source : https://maps.googleapis.com/maps/api/staticmap?center=41.8781,-87.6298&zoom=14&size=640x640&scale=2&maptype=terrain&key=xxx
d + geom_point(aes(x = Dropoff.Centroid.Longitude, y = Dropoff.Centroid.Latitude), data = coc, size = 0.7) +
  theme(legend.position="bottom")
## Warning: Removed 38094 rows containing missing values (geom_point).

It is a bit tricky to see the density of the pickup and dropoffs because all the points are sitting on top of each other. Below sets the alpha variable which will make the dots transparent. This helps display the density of points plotted.

p +   geom_point(aes(x = Pickup.Centroid.Longitude, y = Pickup.Centroid.Latitude), colour = '#011f4b', data = coc, alpha=0.25, size = 0.5) + 
  theme(legend.position="none")
## Warning: Removed 15497 rows containing missing values (geom_point).

p + stat_density2d(
    aes(x = Pickup.Centroid.Longitude, y = Pickup.Centroid.Latitude,
        fill = ..level.., alpha = 0.15), size = 0.01, bins = 30,
    data = coc, geom = "polygon") #+
## Warning: Removed 15497 rows containing non-finite values (stat_density2d).

  #geom_point(aes(x = x, y = y, stroke = 2), colour=col4, data = n, size =1.5) + 
  #geom_label_repel(aes(x, y, label = label), data=n, family = 'Times', size = 3, box.padding = 0.2, point.padding = 0.3, segment.color = 'grey50') 
p + stat_density2d(
    aes(x = Pickup.Centroid.Longitude, y = Pickup.Centroid.Latitude, fill = ..level.., alpha = 0.25),
    size = 0.1, bins = 40, data = coc,
    geom = "polygon"
  ) +
  geom_density2d(data = coc, 
               aes(x = Pickup.Centroid.Longitude, y = Pickup.Centroid.Latitude), size = 0.3)
## Warning: Removed 15497 rows containing non-finite values (stat_density2d).

## Warning: Removed 15497 rows containing non-finite values (stat_density2d).

Move this to user interaction shiny application

I’m going to move this code to a shiny app so one can see the map change by selecting an hour of the day.

#END